Analysis Overview#
In this project, we aim to answer:
Which locations (and even streets, boroughs, and zip codes) in NY have a higher proportion of car accidents?
TODO
TODO
# !pip install geopandas
# !pip install geoplot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from tool import utils
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 50)
from datetime import datetime
# set the filename
# filename = "Motor_Vehicle_Collisions_-_Crashes.csv"
Importing Data#
The dataset we are working with is from the City of New York’s data catalog on motor vehicle crashes. The data contains information from all police reported motor vehicle collisions in NYC in 2012-2023. For the purposes of memory, we only use 2022 data. Click here to download the complete dataset. Be aware that this dataset is quite large and is of 403 Mb.
# important! Only run this cell if you want to download and read the original data file "Motor_Vehicle_Collisions_-_Crashes.csv"
# Otherwise, if you already have the yearly data in your data folder
# # read the csv file into dataframe
# df = pd.read_csv(f"data/{filename}", low_memory=False)
# # set the type of CRASH DATE into datetime
# df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
# # Extract yearly data and write into seperate csv files
# utils.getYearlyData(df, filename)
# read csv into dataframe, specifying the year
crashes = utils.read_csv_of_year(2022)
crashes.head(5)
| Unnamed: 0 | CRASH DATE | CRASH TIME | BOROUGH | ZIP CODE | LATITUDE | LONGITUDE | LOCATION | ON STREET NAME | CROSS STREET NAME | OFF STREET NAME | NUMBER OF PERSONS INJURED | NUMBER OF PERSONS KILLED | NUMBER OF PEDESTRIANS INJURED | NUMBER OF PEDESTRIANS KILLED | NUMBER OF CYCLIST INJURED | NUMBER OF CYCLIST KILLED | NUMBER OF MOTORIST INJURED | NUMBER OF MOTORIST KILLED | CONTRIBUTING FACTOR VEHICLE 1 | CONTRIBUTING FACTOR VEHICLE 2 | CONTRIBUTING FACTOR VEHICLE 3 | CONTRIBUTING FACTOR VEHICLE 4 | CONTRIBUTING FACTOR VEHICLE 5 | COLLISION_ID | VEHICLE TYPE CODE 1 | VEHICLE TYPE CODE 2 | VEHICLE TYPE CODE 3 | VEHICLE TYPE CODE 4 | VEHICLE TYPE CODE 5 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2022-03-26 | 11:45 | NaN | NaN | NaN | NaN | NaN | QUEENSBORO BRIDGE UPPER | NaN | NaN | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | Pavement Slippery | NaN | NaN | NaN | NaN | 4513547 | Sedan | NaN | NaN | NaN | NaN |
| 1 | 2 | 2022-06-29 | 6:55 | NaN | NaN | NaN | NaN | NaN | THROGS NECK BRIDGE | NaN | NaN | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | Following Too Closely | Unspecified | NaN | NaN | NaN | 4541903 | Sedan | Pick-up Truck | NaN | NaN | NaN |
| 2 | 34 | 2022-06-29 | 16:00 | NaN | NaN | NaN | NaN | NaN | WILLIAMSBURG BRIDGE OUTER ROADWA | NaN | NaN | 1.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | Driver Inattention/Distraction | Unspecified | NaN | NaN | NaN | 4542336 | Motorscooter | Station Wagon/Sport Utility Vehicle | NaN | NaN | NaN |
| 3 | 37 | 2022-07-12 | 17:50 | BROOKLYN | 11225.0 | 40.663303 | -73.96049 | (40.663303, -73.96049) | NaN | NaN | 44 EMPIRE BOULEVARD | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | Oversized Vehicle | Unspecified | NaN | NaN | NaN | 4545699 | Sedan | NaN | NaN | NaN | NaN |
| 4 | 38 | 2022-03-23 | 10:00 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 71 EAST DRIVE | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 0 | 0 | Pedestrian/Bicyclist/Other Pedestrian Error/Co... | NaN | NaN | NaN | NaN | 4512922 | Bike | NaN | NaN | NaN | NaN |
Mapping#
Dropping NaNs in Locations column#
crashes["LOCATION"].isna().sum()
8938
crashes["LOCATION"].isna().sum()/crashes.shape[0]*100
8.611868538448938
crashes_locations_map = crashes.dropna(subset=["LOCATION"])
# crashes_locations_map.head()
Plotting total casualties#
crashes_locations_map["TOTAL CASUALTIES"] = crashes_locations_map["NUMBER OF PERSONS INJURED"] + crashes_locations_map["NUMBER OF PERSONS KILLED"]
<ipython-input-13-bfc0c353cf1d>:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
crashes_locations_map["TOTAL CASUALTIES"] = crashes_locations_map["NUMBER OF PERSONS INJURED"] + crashes_locations_map["NUMBER OF PERSONS KILLED"]
crashes_locations_map_2plus = crashes_locations_map[crashes_locations_map["TOTAL CASUALTIES"] > 2]
fig = px.scatter_mapbox(crashes_locations_map_2plus,
lat="LATITUDE",
lon="LONGITUDE",
color="TOTAL CASUALTIES",
size="TOTAL CASUALTIES",
zoom=8,
height=800,
width=1000, color_continuous_scale='thermal')
fig.update_layout(mapbox_style="open-street-map")
fig.show()
Clustering Analysis#
features = crashes_locations_map[['LATITUDE', 'LONGITUDE']]
features
| LATITUDE | LONGITUDE | |
|---|---|---|
| 3 | 40.663303 | -73.960490 |
| 5 | 40.607685 | -74.138920 |
| 6 | 40.855972 | -73.869896 |
| 7 | 40.790276 | -73.939600 |
| 8 | 40.642986 | -74.016210 |
| ... | ... | ... |
| 103782 | 0.000000 | 0.000000 |
| 103783 | 40.877476 | -73.836610 |
| 103784 | 40.690180 | -73.935600 |
| 103785 | 40.694485 | -73.937350 |
| 103786 | 40.687750 | -73.790390 |
94849 rows × 2 columns
from sklearn.cluster import KMeans
# create kmeans model/object
kmeans = KMeans(
init="random",
n_clusters=10,
n_init=10,
max_iter=300,
random_state=42
)
# do clustering
kmeans.fit(features)
# save results
labels = kmeans.labels_
# send back into dataframe and display it
crashes_locations_map['cluster'] = labels
# display the number of mamber each clustering
_clusters = crashes_locations_map.groupby('cluster').count()['COLLISION_ID']
print(_clusters)
cluster
0 10970
1 7555
2 7267
3 13722
4 14850
5 10719
6 9529
7 13749
8 4624
9 1864
Name: COLLISION_ID, dtype: int64
<ipython-input-21-6235f9a12ff7>:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
fig = px.scatter_mapbox(crashes_locations_map,
lat="LATITUDE",
lon="LONGITUDE",
color="cluster",
size="TOTAL CASUALTIES",
zoom=8,
height=800,
width=1000, color_continuous_scale='thermal')
fig.update_layout(mapbox_style="open-street-map")
fig.show()
EDA#
Collisions by Geography#
# function to add value labels
# credit: GeeksforGeeks
def addlabels(x, y):
for i in range(len(x)):
plt.text(i, y[i]+200, y[i], ha="center")
Boroughs#
boroughs_data = crashes.groupby("BOROUGH").count()["COLLISION_ID"]
boroughs_data.index
Index(['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND'], dtype='object', name='BOROUGH')
x = boroughs_data.index
y = boroughs_data.values
plt.bar(x, y)
addlabels(x, y)
plt.xlabel("Boroughs")
plt.ylabel("Count")
plt.title("Collisions by Borough")
plt.show()
Zip Code#
TODO: Discuss - How to display this data more visually appealingly?
zc_data = crashes.groupby("ZIP CODE").count()["CRASH DATE"]
zc_df = pd.DataFrame(zc_data).reset_index()
zc_df.rename(columns={"CRASH DATE": "COUNT"}, inplace=True)
zc_df["ZIP CODE"] = zc_df["ZIP CODE"].astype(int)
zc_df.sort_values(by="COUNT", ascending=False)
| ZIP CODE | COUNT | |
|---|---|---|
| 123 | 11207 | 1763 |
| 128 | 11212 | 1128 |
| 151 | 11236 | 1111 |
| 124 | 11208 | 1099 |
| 119 | 11203 | 996 |
| ... | ... | ... |
| 55 | 10153 | 1 |
| 56 | 10154 | 1 |
| 58 | 10158 | 1 |
| 59 | 10165 | 1 |
| 57 | 10155 | 1 |
210 rows × 2 columns

Collisions by Demography#
# credit: https://matplotlib.org/stable/gallery/lines_bars_and_markers/barchart.html
types = ("Pedestrians", "Motorists", "Cyclists")
counts = {"Injured": (crashes["NUMBER OF PEDESTRIANS INJURED"].sum(), crashes["NUMBER OF MOTORIST INJURED"].sum(), crashes["NUMBER OF CYCLIST INJURED"].sum()),
"Killed": (crashes["NUMBER OF PEDESTRIANS KILLED"].sum(), crashes["NUMBER OF MOTORIST KILLED"].sum(), crashes["NUMBER OF CYCLIST KILLED"].sum()),
}
x = np.arange(len(types)) # the label locations
width = 0.25 # the width of the bars
multiplier = 0
fig, ax = plt.subplots(layout='constrained')
for i, j in counts.items():
offset = width * multiplier
rects = ax.bar(x + offset, j, width, label=i)
ax.bar_label(rects, padding=3)
multiplier += 1
ax.set_ylabel('Count')
ax.set_title('Type of person')
ax.set_xticks(x + width*0.5, types)
ax.legend()
ax.set_ylim(0, 42000)
plt.show()
Collisions by Causes#
TODO:
Discuss - What is the difference amongst cols 1-5 of Contributing Factor Vehicle and Vehicle Type Code?
Should we stratify by borough?
Top 10 Contributing Factor Vehicle 1#
cfv = utils.get_contributing_factor(crashes).sort_values("CRASH DATE", ascending=False).iloc[1:11]
labels = cfv.index
sizes = cfv["CRASH DATE"]
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
# plt.legend(labels, bbox_to_anchor=(1.5, 0.25, 0.5, 0.5))
plt.show()
# For the remaining Contributing Vehicle Factor columns
# the top most reason is "Unspecified" so not very helpful visualizations
Top 10 Vehicle Type Code 1#
TODO: Maybe calculate the most common vehicle type codes across all 5 columns?
# vtc1 = crashes.groupby("VEHICLE TYPE CODE 1").count()["CRASH DATE"].sort_values(ascending=False).head(10)
vtc = utils.get_vehicle_type(crashes).sort_values("CRASH DATE", ascending=False).iloc[:10]
labels = vtc.index
sizes = vtc["CRASH DATE"]
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.show()
Collisions by Time#
crashes['CRASH DATE'] = pd.to_datetime(crashes['CRASH DATE']) # , format='%m/%d/%Y')
month = []
day = []
for i in crashes["CRASH DATE"]:
month.append(i.strftime("%m"))
day.append(i.strftime("%d"))
crashes["CRASH MONTH"] = month
crashes["CRASH DAY"] = day
crashes["CRASH HOUR"] = pd.to_datetime(crashes['CRASH TIME'], format='%H:%M').dt.hour
Month#
month_data = crashes.groupby("CRASH MONTH").count()["CRASH TIME"]
plt.figure(figsize=(8, 6))
# sns.lineplot(month_data.index, month_data.values)
plt.plot(month_data.index, month_data.values)
plt.title("Collisions by Month")
plt.xlabel("Month")
plt.ylabel("Count")
plt.show()
Day#
day_data = crashes.groupby("CRASH DAY").count()["CRASH TIME"]
plt.figure(figsize=(8, 6))
plt.plot(day_data.index, day_data.values)
plt.title("Collisions by Day")
plt.xlabel("Day")
plt.ylabel("Count")
plt.show()
Hour#
hour_data = crashes.groupby("CRASH HOUR").count()["CRASH TIME"]
plt.figure(figsize=(8, 6))
plt.plot(hour_data.index, hour_data.values)
plt.title("Collisions by Hour")
plt.xlabel("Hour")
plt.ylabel("Count")
plt.xticks(np.linspace(0, 23, 24))
plt.show()
Bivariate analysis:#
Investigate relationships between pairs of features, using scatter plots, box plots, or violin plots. For example, you can explore the relationship between contributing factors and the number of persons injured, or between crash time and crash severity.
crashes_18_22 = dict.fromkeys(range(2018, 2023))
for year in crashes_18_22.keys():
crashes_18_22[year] = utils.read_csv_of_year(year)
utils.time_processing(crashes_18_22[year])
α = 1
crashes_18_22[year]["TOTAL CASUALTIES"] = crashes_18_22[year]["NUMBER OF PERSONS INJURED"] + α * crashes_18_22[year]["NUMBER OF PERSONS KILLED"]
casualties_18_22 = dict.fromkeys(range(2018, 2023))
for year in casualties_18_22.keys():
casualties_18_22[year] = crashes_18_22[year].groupby("CRASH HOUR")["TOTAL CASUALTIES"].mean()
plt.figure(figsize=(8, 6))
for year in casualties_18_22.keys():
plt.plot(casualties_18_22[year].index, casualties_18_22[year].values, label=year)
plt.title("Casualties by Hour")
plt.xlabel("Hour")
plt.ylabel("Count")
plt.xticks(np.linspace(0, 23, 24))
plt.legend()
plt.show()